NAME: FRANCIS KWAME SEGBE¶

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport
import plotly.offline as py
C:\Users\HP\AppData\Local\Temp\ipykernel_5464\3042834768.py:5: DeprecationWarning: `import pandas_profiling` is going to be deprecated by April 1st. Please use `import ydata_profiling` instead.
  from pandas_profiling import ProfileReport
In [4]:
# Loading the dataset.
data = pd.read_csv(r"C:\Users\HP\Desktop\Projects File\Customer Churn\WA_Fn-UseC_-Telco-Customer-Churn.csv")
data
Out[4]:
customerID gender SeniorCitizen Partner Dependents tenure PhoneService MultipleLines InternetService OnlineSecurity ... DeviceProtection TechSupport StreamingTV StreamingMovies Contract PaperlessBilling PaymentMethod MonthlyCharges TotalCharges Churn
0 7590-VHVEG Female 0 Yes No 1 No No phone service DSL No ... No No No No Month-to-month Yes Electronic check 29.85 29.85 No
1 5575-GNVDE Male 0 No No 34 Yes No DSL Yes ... Yes No No No One year No Mailed check 56.95 1889.5 No
2 3668-QPYBK Male 0 No No 2 Yes No DSL Yes ... No No No No Month-to-month Yes Mailed check 53.85 108.15 Yes
3 7795-CFOCW Male 0 No No 45 No No phone service DSL Yes ... Yes Yes No No One year No Bank transfer (automatic) 42.30 1840.75 No
4 9237-HQITU Female 0 No No 2 Yes No Fiber optic No ... No No No No Month-to-month Yes Electronic check 70.70 151.65 Yes
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
7038 6840-RESVB Male 0 Yes Yes 24 Yes Yes DSL Yes ... Yes Yes Yes Yes One year Yes Mailed check 84.80 1990.5 No
7039 2234-XADUH Female 0 Yes Yes 72 Yes Yes Fiber optic No ... Yes No Yes Yes One year Yes Credit card (automatic) 103.20 7362.9 No
7040 4801-JZAZL Female 0 Yes Yes 11 No No phone service DSL Yes ... No No No No Month-to-month Yes Electronic check 29.60 346.45 No
7041 8361-LTMKD Male 1 Yes No 4 Yes Yes Fiber optic No ... No No No No Month-to-month Yes Mailed check 74.40 306.6 Yes
7042 3186-AJIEK Male 0 No No 66 Yes No Fiber optic Yes ... Yes Yes Yes Yes Two year Yes Bank transfer (automatic) 105.65 6844.5 No

7043 rows × 21 columns

We will start with the Exploratory Data Analyses and Data Cleaning¶

In [5]:
#Pandas Profiling
profile = ProfileReport(data)
profile
# profile.to_file(output_file='report.html')
Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]
Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]
Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]
Out[5]:

In [6]:
# Printing out the info of our columns section to understand a brief summary of dataset.
#This basically does the samething as the pandas_profiling 
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 
 17  PaymentMethod     7043 non-null   object 
 18  MonthlyCharges    7043 non-null   float64
 19  TotalCharges      7043 non-null   object 
 20  Churn             7043 non-null   object 
dtypes: float64(1), int64(2), object(18)
memory usage: 1.1+ MB
In [7]:
#Printing out the shape and size of our dataset.
data.shape
Out[7]:
(7043, 21)
In [8]:
# Convert Churn column. This is to help the machine learning model understand our data.
data['Churn'] = data['Churn'].replace({'Yes': 1, 'No': 0})

print(data['Churn'].head())
0    0
1    0
2    1
3    0
4    1
Name: Churn, dtype: int64
In [9]:
#Converting "No internet Service" for the following to "No", (  OnlineSecurity,StreamingTV,  
# DeviceProtection,TechSupport,OnlineSecurity,StreamingTV)
cols = ['OnlineSecurity','StreamingTV','DeviceProtection','TechSupport','OnlineBackup','StreamingMovies','MultipleLines']

for col in cols:
    data.loc[data[col] == 'No internet service', col] = 'No'

print(data[cols].head())
  OnlineSecurity StreamingTV DeviceProtection TechSupport OnlineBackup  \
0             No          No               No          No          Yes   
1            Yes          No              Yes          No           No   
2            Yes          No               No          No          Yes   
3            Yes          No              Yes         Yes           No   
4             No          No               No          No           No   

  StreamingMovies     MultipleLines  
0              No  No phone service  
1              No                No  
2              No                No  
3              No  No phone service  
4              No                No  
In [10]:
data.loc[data['MultipleLines'] == 'No phone service', 'MultipleLines'] = 'No'

print(data['MultipleLines'].unique())
['No' 'Yes']
In [11]:
# Replace spaces with NaN
data['TotalCharges'] = data['TotalCharges'].replace(' ', np.nan) 

# Drop rows with NaN in TotalCharges
data.dropna(subset=['TotalCharges'], inplace=True)

# Converting Total charge to float 
data['TotalCharges'] = data['TotalCharges'].astype('float32') 
In [12]:
churn_counts = data['Churn'].value_counts()
churn_counts
Out[12]:
0    5163
1    1869
Name: Churn, dtype: int64

Data Visualization¶

In [13]:
# visualizing our data
import plotly.express as px

churn_counts = data['Churn'].value_counts()

fig = px.pie(values=churn_counts.values, 
             names=churn_counts.index,
             title='Percentage of Churn Customers',
             color_discrete_sequence=['lightgrey', 'red'])

fig.show()

From the above we can see that, only 26% of our entire dataset churned.

In [14]:
# Churn rate by Gender.
# fig = px.pie(data, values='Churn', names='gender', 
#              title='Churn Percentage by Gender')

# fig.update_traces(textinfo='percent+label')
# fig.show()
In [15]:
import plotly.express as px

fig = px.histogram(data, x='gender', color='Churn', 
                   labels={'gender': 'Gender'},
                   color_discrete_sequence=['seagreen', 'lightsalmon'],
                   title='Gender vs Churn')

fig.update_layout(xaxis_title='Gender', 
                  yaxis_title='Count',
                  legend_title='Churn',
                  legend_orientation='h')

for i in range(len(fig.data)):
    fig.data[i].name = 'No' if i==0 else 'Yes'

fig.show()

Here we could see that women was slightly likely to churn than men

In [16]:
colors = ['rebeccapurple', 'seagreen']

fig = px.bar(data, x='TechSupport', y='Churn', 
             title='Churn Rate by Tech Support Service',
             color_discrete_sequence=colors)

fig.update_yaxes(tickformat='.0%')
fig.show()

People with "No" tech support is bigger had much higher churn rate than people with "Yes" Techsupport

In [17]:
# We are looking at Churn rate by internet services.
import plotly.express as px

fig = px.bar(data, x='InternetService', y='Churn',
             title='Churn Rate by Internet Service')

fig.update_yaxes(tickformat='.0%')  

fig.show()

People with fibre optics are more likely to churn.

In [18]:
#Churn rate by payment method
colors = ['lightslategray', 'red', 'seagreen']

fig = px.bar(data, x='PaymentMethod', y='Churn', 
             color_discrete_sequence=colors,
             title='Churn Rate by Payment Method')

fig.update_yaxes(tickformat='.0%')

fig.show()
In [19]:
#Churn rate by Contract
import plotly.express as px

fig = px.bar(data, x='Contract', y='Churn',  
             title='Churn Rate by Contract Duration')

fig.update_yaxes(tickformat='.0%')

fig.show()

People with longer duration contract were less likely to churn.

In [20]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

cat_cols = ["gender","Partner","Dependents","PhoneService",
            "MultipleLines","InternetService","OnlineSecurity",
            "OnlineBackup","DeviceProtection","TechSupport",
            "StreamingTV","StreamingMovies","Contract",
            "PaperlessBilling","PaymentMethod","Churn"]

for col in cat_cols:

  le = LabelEncoder()
  
  data[col] = le.fit_transform(data[col])

print(data[cat_cols].head())
   gender  Partner  Dependents  PhoneService  MultipleLines  InternetService  \
0       0        1           0             0              0                0   
1       1        0           0             1              0                0   
2       1        0           0             1              0                0   
3       1        0           0             0              0                0   
4       0        0           0             1              0                1   

   OnlineSecurity  OnlineBackup  DeviceProtection  TechSupport  StreamingTV  \
0               0             1                 0            0            0   
1               1             0                 1            0            0   
2               1             1                 0            0            0   
3               1             0                 1            1            0   
4               0             0                 0            0            0   

   StreamingMovies  Contract  PaperlessBilling  PaymentMethod  Churn  
0                0         0                 1              2      0  
1                0         1                 0              3      0  
2                0         0                 1              3      1  
3                0         1                 0              0      0  
4                0         0                 1              2      1  
In [21]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Load data
# df = pd.read_csv('telco_churn.csv')

# Define columns to scale  
cols_to_scale = ['tenure', 'MonthlyCharges', 'TotalCharges']

# Create scaler 
scaler = MinMaxScaler()

# Fit and transform selected columns
data[cols_to_scale] = scaler.fit_transform(data[cols_to_scale])

# Verify scaled values 
print(data[cols_to_scale].head())
     tenure  MonthlyCharges  TotalCharges
0  0.000000        0.115423      0.001275
1  0.464789        0.385075      0.215867
2  0.014085        0.354229      0.010310
3  0.619718        0.239303      0.210241
4  0.014085        0.521891      0.015330
In [22]:
data.head()
Out[22]:
customerID gender SeniorCitizen Partner Dependents tenure PhoneService MultipleLines InternetService OnlineSecurity ... DeviceProtection TechSupport StreamingTV StreamingMovies Contract PaperlessBilling PaymentMethod MonthlyCharges TotalCharges Churn
0 7590-VHVEG 0 0 1 0 0.000000 0 0 0 0 ... 0 0 0 0 0 1 2 0.115423 0.001275 0
1 5575-GNVDE 1 0 0 0 0.464789 1 0 0 1 ... 1 0 0 0 1 0 3 0.385075 0.215867 0
2 3668-QPYBK 1 0 0 0 0.014085 1 0 0 1 ... 0 0 0 0 0 1 3 0.354229 0.010310 1
3 7795-CFOCW 1 0 0 0 0.619718 0 0 0 1 ... 1 1 0 0 1 0 0 0.239303 0.210241 0
4 9237-HQITU 0 0 0 0 0.014085 1 0 1 0 ... 0 0 0 0 0 1 2 0.521891 0.015330 1

5 rows × 21 columns

In [23]:
data.columns
Out[23]:
Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

Training our Model¶

In [24]:
#Features variables for x and y
y = data['Churn'] 
X = data.drop(columns=['Churn','customerID'])
In [25]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
In [26]:
# Split data into train and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42) 

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
(4711, 19) (4711,)
(2321, 19) (2321,)
In [27]:
# Import performance metrics
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Split data into train/test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# List of models
models = [DecisionTreeClassifier(), LogisticRegression(), RandomForestClassifier(), GaussianNB()]

for model in models:
  
  # Fit model
  model.fit(X_train, y_train) 
  
  # Make predictions
  y_pred = model.predict(X_test)

  # Evaluate performance
  print(classification_report(y_test, y_pred))
  print(confusion_matrix(y_test, y_pred))
  print(accuracy_score(y_test, y_pred))
              precision    recall  f1-score   support

           0       0.81      0.80      0.81      1549
           1       0.47      0.49      0.48       561

    accuracy                           0.72      2110
   macro avg       0.64      0.65      0.64      2110
weighted avg       0.72      0.72      0.72      2110

[[1241  308]
 [ 285  276]]
0.718957345971564
              precision    recall  f1-score   support

           0       0.84      0.90      0.87      1549
           1       0.64      0.52      0.57       561

    accuracy                           0.80      2110
   macro avg       0.74      0.71      0.72      2110
weighted avg       0.79      0.80      0.79      2110

[[1388  161]
 [ 270  291]]
0.795734597156398
              precision    recall  f1-score   support

           0       0.83      0.89      0.86      1549
           1       0.63      0.49      0.55       561

    accuracy                           0.79      2110
   macro avg       0.73      0.69      0.70      2110
weighted avg       0.77      0.79      0.78      2110

[[1386  163]
 [ 288  273]]
0.7862559241706161
              precision    recall  f1-score   support

           0       0.88      0.76      0.82      1549
           1       0.52      0.72      0.61       561

    accuracy                           0.75      2110
   macro avg       0.70      0.74      0.71      2110
weighted avg       0.79      0.75      0.76      2110

[[1184  365]
 [ 159  402]]
0.7516587677725118
In [34]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
In [35]:
# Best params from tuning 
param_grid = {
  'n_estimators': [100, 200],
  'max_features': ['sqrt'],
  'min_samples_split': [2, 5],
  'min_samples_leaf': [1, 2]
}
In [36]:
from sklearn.model_selection import GridSearchCV

rf = RandomForestClassifier()
grid = GridSearchCV(rf, param_grid, cv=5)
grid.fit(X_train, y_train)
Out[36]:
GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_features': ['sqrt'], 'min_samples_leaf': [1, 2],
                         'min_samples_split': [2, 5],
                         'n_estimators': [100, 200]})
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_features': ['sqrt'], 'min_samples_leaf': [1, 2],
                         'min_samples_split': [2, 5],
                         'n_estimators': [100, 200]})
RandomForestClassifier()
RandomForestClassifier()
In [37]:
best_params = grid.best_params_
rf = RandomForestClassifier(**best_params)
rf.fit(X_train, y_train)
Out[37]:
RandomForestClassifier(min_samples_split=5, n_estimators=200)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(min_samples_split=5, n_estimators=200)
In [38]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(rf, X_train, y_train, cv=5)
print("Cross Validation Accuracy:", np.mean(scores))
Cross Validation Accuracy: 0.7978498617473485
In [39]:
y_pred = rf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

print("Test Accuracy:", test_accuracy)
print("Cross Validation Accuracy:", np.mean(scores))
Test Accuracy: 0.7933649289099526
Cross Validation Accuracy: 0.7978498617473485
In [ ]: